From 9aed0e5eb66c57e4352a5bc293d157eadba77abd Mon Sep 17 00:00:00 2001
From: Ell <ell_se@yahoo.com>
Date: Tue, 7 Nov 2017 15:44:39 -0500
Subject: [PATCH] babl, sse2-float: fall back to slow, accurate path for large
 pow-2.4 inputs

The approximations we use for pow_24() and pow_1_24() diverge from
the actual function for large-enough input values.  This can lead
not just to inaccurate results, but also to infinities and NaNs,
especially when multiple conversions are strung in a row.

When the input value is large enough to produce notable divergence
(the difference between the approximate and actual values is ~1% at
the chosen limits,) fall back to a slower, but more accurate
version.

For the SSE2 float conversions, this results in an increase of ~5%
in conversion time, when all values are below the limit.  When most
values are above the limit, performance can be 10x slower or worse.
---
 babl/base/pow-24.c      | 28 ++++++++++++++++++++++++----
 babl/base/pow-24.h      | 28 ++++++++++++++++++++++++----
 extensions/sse2-float.c | 30 ++++++++++++++++++++++++++++++
 3 files changed, 78 insertions(+), 8 deletions(-)

diff --git a/babl/base/pow-24.c b/babl/base/pow-24.c
index dbc4071..5a4f0ad 100644
--- a/babl/base/pow-24.c
+++ b/babl/base/pow-24.c
@@ -48,8 +48,13 @@ init_newton (double x, double exponent, double c0, double c1, double c2)
 double
 babl_pow_24 (double x)
 {
-  double y = init_newton (x, -1./5, 0.9953189663, 0.9594345146, 0.6742970332);
+  double y;
   int i;
+  if (x > 16.0) {
+    /* for large values, fall back to a slower but more accurate version */
+    return exp (log (x) * 2.4);
+  }
+  y = init_newton (x, -1./5, 0.9953189663, 0.9594345146, 0.6742970332);
   for (i = 0; i < 3; i++)
     y = (1.+1./5)*y - ((1./5)*x*(y*y))*((y*y)*(y*y));
   x *= y;
@@ -61,9 +66,14 @@ babl_pow_24 (double x)
 double
 babl_pow_1_24 (double x)
 {
-  double y = init_newton (x, -1./12, 0.9976800269, 0.9885126933, 0.5908575383);
+  double y;
   int i;
   double z;
+  if (x > 1024.0) {
+    /* for large values, fall back to a slower but more accurate version */
+    return exp (log (x) * (1.0 / 2.4));
+  }
+  y = init_newton (x, -1./12, 0.9976800269, 0.9885126933, 0.5908575383);
   x = sqrt (x);
   /* newton's method for x^(-1/6) */
   z = (1./6.) * x;
@@ -102,8 +112,13 @@ init_newtonf (float x, float exponent, float c0, float c1, float c2)
 float
 babl_pow_24f (float x)
 {
-  float y = init_newtonf (x, -1.f/5, 0.9953189663f, 0.9594345146f, 0.6742970332f);
+  float y;
   int i;
+  if (x > 16.0f) {
+    /* for large values, fall back to a slower but more accurate version */
+    return expf (logf (x) * 2.4f);
+  }
+  y = init_newtonf (x, -1.f/5, 0.9953189663f, 0.9594345146f, 0.6742970332f);
   for (i = 0; i < 3; i++)
     y = (1.f+1.f/5)*y - ((1./5)*x*(y*y))*((y*y)*(y*y));
   x *= y;
@@ -115,9 +130,14 @@ babl_pow_24f (float x)
 float
 babl_pow_1_24f (float x)
 {
-  float y = init_newtonf (x, -1.f/12, 0.9976800269f, 0.9885126933f, 0.5908575383f);
+  float y;
   int i;
   float z;
+  if (x > 1024.0f) {
+    /* for large values, fall back to a slower but more accurate version */
+    return expf (logf (x) * (1.0f / 2.4f));
+  }
+  y = init_newtonf (x, -1.f/12, 0.9976800269f, 0.9885126933f, 0.5908575383f);
   x = sqrtf (x);
   /* newton's method for x^(-1/6) */
   z = (1.f/6.f) * x;
diff --git a/babl/base/pow-24.h b/babl/base/pow-24.h
index a55c029..ed42cfe 100644
--- a/babl/base/pow-24.h
+++ b/babl/base/pow-24.h
@@ -54,8 +54,13 @@ init_newton (double x, double exponent, double c0, double c1, double c2)
 static inline double
 babl_pow_24 (double x)
 {
-  double y = init_newton (x, -1./5, 0.9953189663, 0.9594345146, 0.6742970332);
+  double y;
   int i;
+  if (x > 16.0) {
+    /* for large values, fall back to a slower but more accurate version */
+    return exp (log (x) * 2.4);
+  }
+  y = init_newton (x, -1./5, 0.9953189663, 0.9594345146, 0.6742970332);
   for (i = 0; i < 3; i++)
     y = (1.+1./5)*y - ((1./5)*x*(y*y))*((y*y)*(y*y));
   x *= y;
@@ -67,9 +72,14 @@ babl_pow_24 (double x)
 static inline double
 babl_pow_1_24 (double x)
 {
-  double y = init_newton (x, -1./12, 0.9976800269, 0.9885126933, 0.5908575383);
+  double y;
   int i;
   double z;
+  if (x > 1024.0) {
+    /* for large values, fall back to a slower but more accurate version */
+    return exp (log (x) * (1.0 / 2.4));
+  }
+  y = init_newton (x, -1./12, 0.9976800269, 0.9885126933, 0.5908575383);
   x = sqrt (x);
   /* newton's method for x^(-1/6) */
   z = (1./6.) * x;
@@ -133,8 +143,13 @@ init_newtonf (float x, float exponent, float c0, float c1, float c2)
 static inline float
 babl_pow_24f (float x)
 {
-  float y = init_newtonf (x, -1.f/5, 0.9953189663f, 0.9594345146f, 0.6742970332f);
+  float y;
   int i;
+  if (x > 16.0f) {
+    /* for large values, fall back to a slower but more accurate version */
+    return expf (logf (x) * 2.4f);
+  }
+  y = init_newtonf (x, -1.f/5, 0.9953189663f, 0.9594345146f, 0.6742970332f);
   for (i = 0; i < 3; i++)
     y = (1.f+1.f/5)*y - ((1./5)*x*(y*y))*((y*y)*(y*y));
   x *= y;
@@ -146,9 +161,14 @@ babl_pow_24f (float x)
 static inline float
 babl_pow_1_24f (float x)
 {
-  float y = init_newtonf (x, -1.f/12, 0.9976800269f, 0.9885126933f, 0.5908575383f);
+  float y;
   int i;
   float z;
+  if (x > 1024.0f) {
+    /* for large values, fall back to a slower but more accurate version */
+    return expf (logf (x) * (1.0f / 2.4f));
+  }
+  y = init_newtonf (x, -1.f/12, 0.9976800269f, 0.9885126933f, 0.5908575383f);
   x = sqrtf (x);
   /* newton's method for x^(-1/6) */
   z = (1.f/6.f) * x;
diff --git a/extensions/sse2-float.c b/extensions/sse2-float.c
index d26073c..223f85c 100644
--- a/extensions/sse2-float.c
+++ b/extensions/sse2-float.c
@@ -237,6 +237,22 @@ conv_rgbAF_linear_rgbaF_linear_spin (const Babl *conversion,const float *src, fl
 #define FLT_ONE 0x3f800000 // ((union {float f; int i;}){1.0f}).i
 #define FLT_MANTISSA (1<<23)
 
+static inline float
+sse_max_component (__v4sf x) {
+  __v4sf s;
+  __v4sf m;
+
+  /* m = [max (x[3], x[1]), max (x[2], x[0])] */
+  s = (__v4sf) _mm_shuffle_epi32 ((__m128i) x, _MM_SHUFFLE(0, 0, 3, 2));
+  m = _mm_max_ps (x, s);
+
+  /* m = [max (m[1], m[0])] = [max (max (x[3], x[1]), max (x[2], x[0]))] */
+  s = (__v4sf) _mm_shuffle_epi32 ((__m128i) m, _MM_SHUFFLE(0, 0, 0, 1));
+  m = _mm_max_ps (m, s);
+
+  return m[0];
+}
+
 static inline __v4sf
 sse_init_newton (__v4sf x, double exponent, double c0, double c1, double c2)
 {
@@ -249,6 +265,13 @@ static inline __v4sf
 sse_pow_1_24 (__v4sf x)
 {
   __v4sf y, z;
+  if (sse_max_component (x) > 1024.0f) {
+    /* for large values, fall back to a slower but more accurate version */
+    return _mm_set_ps (expf (logf (x[3]) * (1.0f / 2.4f)),
+                       expf (logf (x[2]) * (1.0f / 2.4f)),
+                       expf (logf (x[1]) * (1.0f / 2.4f)),
+                       expf (logf (x[0]) * (1.0f / 2.4f)));
+  }
   y = sse_init_newton (x, -1./12, 0.9976800269, 0.9885126933, 0.5908575383);
   x = _mm_sqrt_ps (x);
   /* newton's method for x^(-1/6) */
@@ -262,6 +285,13 @@ static inline __v4sf
 sse_pow_24 (__v4sf x)
 {
   __v4sf y, z;
+  if (sse_max_component (x) > 16.0f) {
+    /* for large values, fall back to a slower but more accurate version */
+    return _mm_set_ps (expf (logf (x[3]) * 2.4f),
+                       expf (logf (x[2]) * 2.4f),
+                       expf (logf (x[1]) * 2.4f),
+                       expf (logf (x[0]) * 2.4f));
+  }
   y = sse_init_newton (x, -1./5, 0.9953189663, 0.9594345146, 0.6742970332);
   /* newton's method for x^(-1/5) */
   z = splat4f (1.f/5.f) * x;
-- 
2.30.2